Purpose of this notebook

The website http://figure.nz is a good starting point for the health data sources/datasets review project because:

One interesting feature of their website is the tagging of each item. This set of tags could be really useful in the context of our project:

Two problems:

query <- "match (tag:Tag) return upper(tag.name) as tag"

tags <- cypher(graph, query) %>% 
        mutate(tag=str_trim(tag)) %>% 
        unique() # uppercasings can results in edges duplicates

print(paste("total number of tags:", nrow(tags)))
## [1] "total number of tags: 411"

The strategy consists in:

Grouping based on spelling/abbreviation

Let’s build the lookup table to perform this grouping:

head(lkup.corrections)
##                     old         new
## 1           AFTER HOURS AFTER-HOURS
## 2                DEATHS       DEATH
## 3                  DHBS         DHB
## 4              DISEASES     DISEASE
## 5 DISTRICT HEALTH BOARD         DHB
## 6               DOCTORS      DOCTOR

Grouping based on graph analytics - part 1

Some tags always occur together figure.nz items. Might be a good idea to group them. This is related to the concept of clique in graph theory. Lets have a look:

edges <- coocurences %>%
         inner_join(degrees, by=c("from" = "tag")) %>%
         mutate(weight_from_to=coocurrence/degree) %>%
         select(from, to, coocurrence, weight_from_to) %>%
         inner_join(degrees, by=c("to" = "tag")) %>%
         mutate(weight_to_from=coocurrence/degree) %>%
         filter(weight_from_to==1, weight_to_from==1)

nodes <- data.frame(id=unique(c(edges$from, edges$to)))

nodes$label <- nodes$id

ig <- graph_from_data_frame(edges, directed=FALSE)

clusters <- clusters(ig)

table(clusters$csize)
## 
##  2  3  4  5 
## 35 13  4  3
visNetwork(nodes, edges, main=list(text="Cliques among the health related tags graph", style="font-family:serif, Georgia, Times New Roman, Times;font-size:20px;text-align:left;color:darkblue;text-decoration:underline;"))

Which results in new categories after grouping:

head(lkup.group)
##   group                             label
## 1     9                  DRINKING/ALCOHOL
## 2    11 ENERGY TRANSFORMATION/ELECTRICITY
## 3    13                     FARMING/COSTS
## 4    15               HEARING/AUDIOLOGIST
## 5    19                     INFLUENZA/FLU
## 6    20                INSURANCE/COVERAGE

Grouping based on graph analytics - part 2

Let’s look for “tags communities” within the graph:

lkup.corrections.group <- nodes %>%
                          select(id, group) %>%
                          inner_join(lkup.group, by=c("group" = "group")) 

edges <- coocurences %>%
         left_join(lkup.corrections.group, by=c("from" = "id")) %>%
         mutate(from=ifelse(is.na(label), from, label)) %>%
         select(from, to, coocurrence) %>%
         left_join(lkup.corrections.group, by=c("to" = "id")) %>%
         mutate(to=ifelse(is.na(label), to, label)) %>%
         select(from, to, coocurrence) %>%
         filter(from!=to) %>% # grouping may create non existing edges
         group_by(from, to) %>%
         summarise(weight=sum(coocurrence))

nodes <- data.frame(id=unique(c(edges$from, edges$to))) 

nodes$label <- nodes$id

ig <- graph_from_data_frame(edges, directed=F)

clusters <- cluster_infomap(ig)

nodes$group <- clusters$membership

print(paste("total number of communities:", max(clusters$membership)))
## [1] "total number of communities: 47"
visNetwork(nodes, edges, main=list(text="Health related tags graph after first grouping", style="font-family:serif, Georgia, Times New Roman, Times;font-size:20px;text-align:left;color:darkblue;text-decoration:underline;"))

Some really make sense:

paste(clusters[9][[1]], collapse=", ")
## [1] "EATING, FAST FOOD, FITNESS, FOOD, FRUIT, NUTRITION, TAKEAWAYS, VEGETABLES, WEIGHT, BREAKFAST, DIET"
paste(clusters[24][[1]], collapse=", ")
## [1] "EMISSIONS, INTERNATIONAL, PM10, POLLUTION, AIR"
paste(clusters[35][[1]], collapse=", ")
## [1] "R&D, RESEARCH, PHD/DOCTORATE"
paste(clusters[18][[1]], collapse=", ")
## [1] "ADHD, ANXIETY, DEPRESSION, MANIC DEPRESSION, MENTAL HEALTH, PRIVATE HOSPITAL, BIPOLAR, CLINICAL PSYCHOLOGIST"
paste(clusters[27][[1]], collapse=", ")
## [1] "DECAY, DENTAL NURSE, DENTIST, ORAL HEALTH, TEETH"

Final grouping

Swith to excel here (don’t tell anyone) to dive into each community to keep grouping tags. Final lookup table is here lkup_after_manual_intervention.csv.

print(paste("final number of tags:", nrow(lkup.final %>% filter(!is.na(new)) %>% select(new) %>% unique())))
## [1] "final number of tags: 145"
nodes$value <- betweenness(ig)

nodes$degree <- degree(ig)

lkup.final %>% 
group_by(new) %>%
summarize(tags=n()) %>%
inner_join(nodes, by=c("new" = "id")) %>% 
select(tag=new, tags, value, degree, group) %>%
plot_ly(x=~degree, y=~value, size=~10*sqrt(tags),
        type='scatter', mode='markers', sizes=c(1, 60),
        marker=list(symbol='circle', sizemode='diameter', line=list(width=2, color='#FFFFFF')),
        text=~paste(tag, ':  #', tags, ' tags'),
        hoverinfo='text') %>%
layout(title='Tags profiles in the final graph',
       xaxis=list(title='degree',
                  gridcolor='rgb(255, 255, 255)',
                  type='log',
                  zerolinewidth = 1,
                  ticklen = 5,
                  gridwidth = 2),
       yaxis=list(title='betweeness',
                  gridcolor = 'rgb(255, 255, 255)',
                  type='log',
                  zerolinewidth = 1,
                  ticklen = 5,
                  gridwith = 2),
      paper_bgcolor = 'rgb(243, 243, 243)',
      plot_bgcolor = 'rgb(243, 243, 243)')